#!/usr/bin/env python
#===- gen_std.py -  ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.

The generated files are located in clang/include/Tooling/Inclusions.

Caveats and FIXMEs:
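  - std literal operators (e.g. those in std::literals::chrono_literals) are
    not indexed, and std::placeholders symbols are handled manually in
    StdSpecialSymbolMap.inc; other std sub-namespaces (e.g. chrono) are covered.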
  - symbols with multiple variants or defined in multiple headers aren't added,
    e.g. std::move, std::swap

Usage:
  1. Install BeautifulSoup dependency, see instruction:
       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
  2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at
       https://en.cppreference.com/w/Cppreference:Archives
  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
     get a "cppreference/reference" directory.
  4. Run the command:
       // Generate C++ symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
       // Generate C symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""


import cppreference_parser
import argparse
import datetime
import os
import sys
import re


# Banner printed at the top of the generated .inc file. The two %s placeholders
# are filled, in order, with the upper-cased -symbols value and the date taken
# from the modification time of the cppreference index page.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""

def ParseArg():
  """Parse the command-line flags of gen_std.py from sys.argv.

  Both -cppreference and -symbols are registered as required flags.

  Returns:
    The argparse namespace, carrying `cppreference` and `symbols` attributes.
  """
  arg_parser = argparse.ArgumentParser(description='Generate StdGen file')
  # (flag, keyword options) pairs, registered in declaration order.
  flag_specs = (
      ('-cppreference', dict(
          metavar='PATH',
          default='',
          help='path to the cppreference offline HTML directory',
          required=True)),
      ('-symbols', dict(
          default='cpp',
          help='Generate c or cpp (removed) symbols. One of {cpp, c, cpp_removed}.',
          required=True)),
  )
  for flag, options in flag_specs:
    arg_parser.add_argument(flag, **options)
  return arg_parser.parse_args()

def AdditionalHeadersForIOSymbols(symbol):
  """Return extra headers that can also provide an IO-related symbol.

  Args:
    symbol: an object with a `name` and a `headers` list holding exactly one
      header (anything else trips the assertion).

  Returns:
    A list containing "<iostream>" when the symbol's header is one of <ios>,
    <istream>, <ostream> or <streambuf>, followed by "<iosfwd>" when the
    symbol's name is declared in <iosfwd>. Empty when neither applies.
  """
  # IO-related names declared in the <iosfwd> header, per C++
  # [iosfwd.syn 31.3.1].
  declared_in_iosfwd = frozenset("""
      basic_ios basic_streambuf basic_istream basic_ostream basic_iostream
      basic_stringbuf basic_istringstream basic_ostringstream basic_stringstream
      basic_spanbuf basic_ispanstream basic_ospanstream basic_spanstream
      basic_filebuf basic_ifstream basic_ofstream basic_fstream
      basic_syncbuf basic_osyncstream
      istreambuf_iterator ostreambuf_iterator
      ios wios
      streambuf istream ostream iostream
      stringbuf istringstream ostringstream stringstream
      spanbuf ispanstream ospanstream spanstream
      filebuf ifstream ofstream fstream
      syncbuf osyncstream
      wstreambuf wistream wostream wiostream
      wstringbuf wistringstream wostringstream wstringstream
      wspanbuf wispanstream wospanstream wspanstream
      wfilebuf wifstream wofstream wfstream
      wsyncbuf wosyncstream
      fpos streampos wstreampos u8streampos u16streampos u32streampos
  """.split())
  # <iostream> is an alternative to these headers, per C++
  # [iostream.syn 31.4.1].
  covered_by_iostream = ("<ios>", "<istream>", "<ostream>", "<streambuf>")

  assert(len(symbol.headers) == 1)
  primary_header = symbol.headers[0]

  # Order matters: <iostream> is preferred over <iosfwd>, so it comes first.
  candidates = (
      ("<iostream>", primary_header in covered_by_iostream),
      ("<iosfwd>", symbol.name in declared_in_iosfwd),
  )
  return [extra for extra, applies in candidates if applies]


def GetCCompatibilitySymbols(symbol):
  """Return the global-namespace variants of a symbol from a <cname> header.

  Args:
    symbol: an object with `name`, `namespace` and a `headers` list holding
      exactly one header (anything else trips the assertion).

  Returns:
    An empty list when the header is not a C++ form of a C standard header, or
    when the symbol is one of the names that are not placed in the global
    namespace. Otherwise a list of cppreference_parser.Symbol entries with a
    None namespace: one for the <cname> header (skipped when the input symbol
    already has a None namespace) and one for the matching <name.h> header.
  """
  # C++ form of the C standard headers.
  c_compat_headers = {
    "<cassert>",
    "<cctype>",
    "<cerrno>",
    "<cfenv>",
    "<cfloat>",
    "<cinttypes>",
    "<climits>",
    "<clocale>",
    "<cmath>",
    "<csetjmp>",
    "<csignal>",
    "<cstdarg>",
    "<cstddef>",
    "<cstdint>",
    "<cstdio>",
    "<cstdlib>",
    "<cstring>",
    "<ctime>",
    "<cuchar>",
    "<cwchar>",
    "<cwctype>",
  }
  # C++ [support.c.headers.other] 17.14.7
  #    ..., behaves as if each name placed in the standard library namespace by
  #    the corresponding <cname> header is placed within the global namespace
  #    scope, except for the functions described in [sf.cmath], the
  #    std::lerp function overloads ([c.math.lerp]), the declaration of
  #    std::byte ([cstddef.syn]), and the functions and function templates
  #    described in [support.types.byteops].
  #
  # Each entry is a regex that must match the whole symbol name. The optional
  # [fl] suffix covers the float / long double variants.
  exception_symbols = (
    "(assoc_)?laguerre[fl]?",
    "(assoc_|sph_)?legendre[fl]?",
    "beta[fl]?",
    "(comp_)?ellint_[1-3][fl]?",
    "(cyl_|sph_)?bessel_[i-k][fl]?",
    # sph_bessel has no _i/_j/_k suffix, so it needs its own pattern.
    "sph_bessel[fl]?",
    "(cyl_|sph_)?neumann[fl]?",
    "expint[fl]?",
    "hermite[fl]?",
    "riemann_zeta[fl]?",
    "lerp",
    "byte",
    # The only named function in [support.types.byteops].
    "to_integer",
  )
  assert len(symbol.headers) == 1
  header = symbol.headers[0]
  if header not in c_compat_headers:
    return []
  if any(re.fullmatch(pattern, symbol.name) for pattern in exception_symbols):
    return []

  # Introduce two more entries, both in the global namespace, one using the
  # C++-compat header and another using the C header.
  results = []
  if symbol.namespace is not None:
    # A symbol that already has no namespace (e.g. a C macro) would otherwise
    # be printed twice with the same header.
    results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
  c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
  results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
  return results


def main():
  """Generate the symbol-to-header table and print it to stdout.

  Reads the command-line flags, picks the cppreference index pages for the
  requested symbol set, asks cppreference_parser for the symbols found on those
  pages and prints one SYMBOL(...) line per (symbol, header) pair after the
  file banner. Symbols with no header or with several headers are skipped and
  reported on stderr.

  Exits with an error message when the -symbols value is not handled here or
  when the expected index directory does not exist.
  """
  args = ParseArg()
  if args.symbols == 'cpp':
    page_root = os.path.join(args.cppreference, "en", "cpp")
    symbol_index_root = os.path.join(page_root, "symbol_index")
    parse_pages = [
      (page_root, "symbol_index.html", "std::"),
      # std sub-namespace symbols have separated pages.
      # We don't index std literal operators (e.g.
      # std::literals::chrono_literals::operator""d), these symbols can't be
      # accessed by std::<symbol_name>.
      #
      # std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
      (symbol_index_root, "chrono.html", "std::chrono::"),
      (symbol_index_root, "execution.html", "std::execution::"),
      (symbol_index_root, "numbers.html", "std::numbers::"),
      (symbol_index_root, "filesystem.html", "std::filesystem::"),
      (symbol_index_root, "pmr.html", "std::pmr::"),
      (symbol_index_root, "ranges.html", "std::ranges::"),
      (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
      (symbol_index_root, "this_thread.html", "std::this_thread::"),
      # Zombie symbols that were available from the Standard Library, but are
      # removed in the following standards.
      (symbol_index_root, "zombie_names.html", "std::"),
      (symbol_index_root, "macro.html", None),
    ]
  elif args.symbols == 'c':
    page_root = os.path.join(args.cppreference, "en", "c")
    symbol_index_root = page_root
    parse_pages = [(page_root, "index.html", None)]
  else:
    # Without this branch the names above stay unbound and the script dies
    # with an UnboundLocalError instead of a usable diagnostic.
    sys.exit("Unsupported -symbols value '%s'; expected 'cpp' or 'c'."
             % args.symbols)

  if not os.path.exists(symbol_index_root):
    sys.exit("Path %s doesn't exist!" % symbol_index_root)

  symbols = cppreference_parser.GetSymbols(parse_pages)

  # We don't have version information from the unzipped offline HTML files,
  # so we use the modified time of the index.html under the page root as the
  # version.
  index_page_path = os.path.join(page_root, "index.html")
  cppreference_modified_date = datetime.datetime.fromtimestamp(
    os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
  print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
  for symbol in symbols:
    if len(symbol.headers) == 1:
      # The symbol itself plus its global-namespace C compatibility variants.
      augmented_symbols = [symbol]
      augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
      for s in augmented_symbols:
        s.headers.extend(AdditionalHeadersForIOSymbols(s))
        for header in s.headers:
          # SYMBOL(unqualified_name, namespace, header)
          print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace,
                                        header))
    elif len(symbol.headers) == 0:
      sys.stderr.write("No header found for symbol %s\n" % symbol.name)
    else:
      # FIXME: support symbols with multiple headers (e.g. std::move).
      sys.stderr.write("Ambiguous header for symbol %s: %s\n" % (
          symbol.name, ', '.join(symbol.headers)))


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()
